In [1]:
from IPython.core.display import HTML
# Load the notebook's custom stylesheet and render it as the cell output.
# The context manager closes the file handle as soon as the contents are read,
# instead of leaving it open until garbage collection.
with open("../css/custom.css", "r") as css_file:
    styles = css_file.read()
HTML(styles)
Out[1]:
In [2]:
import pandas as pd
import numpy as np
import cPickle as pickle
import json
In [12]:
# Location of the raw tweet dump read by the loading cells below.
# NOTE(review): this is an absolute, user-specific path, so the notebook only
# runs on the original author's machine; other cells use project-relative
# paths ("../css/...", "../charts/...") — consider doing the same here.
jsonpath = '/Users/rcn/Desktop/twitter-analysis/data/raw/tweets.json'
In [21]:
# Read the tweet dump at `jsonpath` into a DataFrame, treating the file as a
# list of record objects (orient='records').
tweets = pd.read_json(jsonpath, orient='records')
In [20]:
# Flatten the nested tweet records into a table with one column per leaf field.
# json_normalize works on already-parsed JSON (a dict or a list of dicts), not
# on a file path, so the file is parsed with the json module first.
# NOTE: this rebinds `tweets`, replacing the frame produced by pd.read_json.
with open(jsonpath) as json_file:
    tweets = pd.io.json.json_normalize(json.load(json_file))
In [15]:
# Report how many rows were loaded into `tweets`.
print('We have %d tweets in total' % len(tweets))
In [22]:
# Build the frame every later cell works on from the loaded `tweets`.
twitterData = pd.DataFrame(tweets)
In [23]:
# Preview the first rows to sanity-check the load.
twitterData.head()
Out[23]:
In [24]:
# TODO: the original cell contained only the unfinished statement `text =`,
# which is a SyntaxError and stops a Restart & Run All. The intended right-hand
# side is unknown, so the assignment is left out until it can be completed.
Out[24]:
In [18]:
# Show the dtype pandas assigned to each column.
twitterData.dtypes
Out[18]:
In [10]:
# Scratch cell: the commented-out lines are earlier attempts at turning a
# comma-separated `twitter_mentions` column into lists. They are not executed.
#twitterData.twitter_mentions=
#twitterData[30:45].twitter_mentions.str.split(',').astype(list).astype('str')
#twitterData.twitter_mentions_list=twitterData.twitter_mentions.str.split(',').astype(list).astype('str')
#twitterData.twitter_mentions_list=twitterData.twitter_mentions.apply(lambda x: list(str(x).split(',')))
#twitterData.twitter_mentions_list[40:45].get_values()
# Inspect rows 40-44 of the user mentions as a raw array.
# NOTE(review): `twitterData.entities` is a column (Series); a Series has no
# `user_mentions` attribute unless that happens to be an index label — confirm
# how the mentions are actually stored before relying on this line.
twitterData.entities.user_mentions[40:45].get_values()
In [10]:
# Leftover introspection: evaluates to the (unbound) Series.get_values method
# so its repr is displayed; it has no effect on the analysis.
pd.Series.get_values
Out[10]:
In [8]:
# Pull the first element of the mentions list stored in row 44.
# NOTE(review): `twitter_mentions_list` is only ever assigned in commented-out
# code in this notebook, so this cell presumably depends on state from an
# earlier kernel session — verify the column exists on a fresh run.
(twitterData.twitter_mentions_list[44:45]).get_values()[0][0]
Out[8]:
In [9]:
# Summary statistics for the frame's columns.
twitterData.describe()
Out[9]:
In [10]:
# Number of rows (tweets) in the full dataset.
nTweets = len(twitterData.index)
# Uses the print() call form with %-formatting, matching the earlier
# "We have %d tweets in total" cell; the Python 2 print statement used before
# is a SyntaxError on Python 3. The printed text is unchanged.
print('There are %d tweets in the full dataset' % nTweets)
In [11]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
In [9]:
#twitterData['Friends'].plot()
In [12]:
# Star-import bokeh's plotting functions; the bokeh cells below call them by
# bare name (figure, hold, line, legend, xaxis, show, ...).
from bokeh.plotting import *
# Send bokeh output to the notebook.
output_notebook()
In [13]:
# Count of non-null tweet ids.
# Stored under its own name: the original cell rebound `tweets`, which is the
# DataFrame that the loading cells create and that `len(tweets)` and
# `pd.DataFrame(tweets)` read, so re-running those cells afterwards would have
# operated on an integer instead of the data.
nTweetIds = twitterData.id.count()
nTweetIds
Out[13]:
In [14]:
# Star-import ggplot; the ggplot cells at the end of the notebook use the bare
# names ggplot, aes, geom_bar, ggtitle and labs.
# NOTE(review): this follows the bokeh.plotting star-import, so any name both
# packages export now refers to ggplot's version — confirm there is no clash
# with the bokeh functions used in the next cells.
from ggplot import *
# Render matplotlib figures inline in the notebook.
%matplotlib inline
In [18]:
# Styled bokeh line chart of twitterData.id against twitterData.time.
# Every call below is a bare name coming from the bokeh.plotting star-import
# and acts on bokeh's implicit current plot, so the statement order matters.
figure(
title='Number of Tweets', # Plot title
title_text_font='Courier New', # Title font
title_text_color='#5d6263', # Title font colour
plot_width=1000, # Plot width
plot_height=600, # Plot height
background_fill='#f6f6f6', # Background colour
border_fill='#f6f6f6', # Border background
border_symmetry='hv', # h=horizontal, v=vertical
outline_line_color='#f6f6f6', # Plot area border colour
x_axis_type = 'datetime', # For timeseries only
tools='pan,box_zoom,previewsave,resize,select,reset' # Available: pan,wheel_zoom,box_zoom,previewsave,resize,select,reset
)
# NOTE(review): presumably keeps subsequent glyphs on the same plot — confirm
# against the installed bokeh version.
hold()
# NOTE(review): the y values are the raw tweet ids, while the title and legend
# talk about a number of tweets — confirm this is the intended series.
line(
twitterData.time, # x
twitterData.id, # y
color='#00aeef', # Line colour
line_width=3, # Line width in px
legend='Tweets', # Legend label
)
# Legend text styling.
legend().label_text_font='Courier New'
legend().label_text_color='#5d6263'
legend().label_outline_line_color='#f6f6f6'
# Axis lines: none on y, light gray on x.
yaxis().axis_line_color = None
xaxis().axis_line_color = '#d4d4d4'
# Tick-label font for the axes.
axis().major_label_text_font="Courier New"
axis().major_label_text_font_size="12pt"
# Grid: horizontal lines only (x grid off, y grid light gray, 1px).
xgrid().grid_line_color = None
ygrid().grid_line_color = "#d4d4d4"
ygrid().grid_line_width = 1
show()
In [22]:
# Line chart of twitterData['friends'] over twitterData['time'], directed to
# the standalone HTML file "friends.html".
# NOTE(review): this cell never calls show(); the next cell's show() is the
# first one after it — confirm that is where this plot is meant to be rendered.
output_file("friends.html", title="timeseries example")
hold()
line(
twitterData['time'], # x coordinates
twitterData['friends'], # y coordinates
color='#A6CEE3', # set a color for the line
legend='Friends', # attach a legend label
x_axis_type = "datetime", # NOTE: only needed on first
tools="pan,wheel_zoom,box_zoom,reset,previewsave" # NOTE: only needed on first
)
Out[22]:
In [23]:
# Plot a cumulative sum against tweet time.
x = twitterData['time']
# NOTE(review): cumsum() is applied to the whole DataFrame, so `y` is a frame
# with one cumulative column per input column rather than a single series —
# presumably one specific column was intended; confirm which.
y = twitterData.cumsum()
line(x,y, color="#0000FF", tools=[])
show()
In [29]:
# Languages included in taxonomy: en, hi, ur, sw, ha, ig, yo
nDataSiftType = twitterData.type.value_counts(normalize=True, sort=True, ascending=False, bins=None)
nDataSiftType[0:10]
Out[29]:
In [25]:
# Languages included in taxonomy: en, hi, ur, sw, ha, ig, yo
nDataSiftLanguage = twitterData.datasift_lang.value_counts(normalize=True, sort=True, ascending=False, bins=None)
nDataSiftLanguage[0:10]
Out[25]:
In [26]:
# Share of tweets per `twitter_lang` value, most common first.
nTwitterLanguage = twitterData['twitter_lang'].value_counts(normalize=True)
# Ten most frequent languages.
nTwitterLanguage.head(10)
Out[26]:
In [27]:
# Reminder: Locations we are interested in are "IN", "PK", "NG", and "KE".
nLocation = twitterData.twitter_location.value_counts(normalize=False, sort=True, ascending=False, bins=None)
nLocation[0:15]
Out[27]:
In [28]:
# Reminder: Locations we are interested in are "IN", "PK", "NG", and "KE".
nUngpLocation = twitterData.UNGP_location.value_counts(normalize=False, sort=True, ascending=False, bins=None)
nUngpLocation[0:15]
Out[28]:
In [37]:
# Getting Vincent ready
vincent.initialize_notebook()
gpBlue='#00aeef'
gpLightGray='#96999b'
gpDarkBlue='#00447c'
gpRed='#cf5c42'
gpBrown='#e1d8ad'
gpPink='#f4d5e3'
gpLightBlue='#e1f4fd'
In [39]:
# Vincent styling classes; the two bar-chart cells that follow reuse them.
from vincent.axes import AxisProperties
from vincent.properties import PropertySet
from vincent.values import ValueRef

# Mean of every numeric column per 'UNGPLocation' group, dropping groups with
# missing means, then the 'Followers' means in ascending order.
location_grouped = twitterData.groupby('UNGPLocation')
mean_location_grouped = location_grouped.mean().dropna()
mean_followers = mean_location_grouped.sort('Followers')['Followers']

followersBar = vincent.Bar(mean_followers)
followersBar.axis_titles(x='Country', y='Followers')

# Give both axes light-gray strokes, labels and 20pt titles.
for axis in followersBar.axes:
    axis.properties = AxisProperties()
    for prop in ('ticks', 'axis', 'major_ticks', 'minor_ticks'):
        setattr(
            axis.properties,
            prop,
            PropertySet(stroke=ValueRef(value=gpLightGray)),
        )
    axis.properties.title = PropertySet(
        font_size=ValueRef(value=20),
        fill=ValueRef(value=gpLightGray),
    )
    axis.properties.labels = PropertySet(fill=ValueRef(value=gpLightGray))

# First axis only: horizontal, centred labels and a title pushed down by 20.
followersBar.axes[0].properties.labels.angle = ValueRef(value=0)
followersBar.axes[0].properties.labels.align = ValueRef(value='center')
followersBar.axes[0].properties.title.dy = ValueRef(value=20)

# Single-colour bars.
followersBar.scales[2].range = [gpBlue]

# Save the chart spec, then display the chart as the cell output.
followersBar.to_json('../charts/followersBar.json')
followersBar
Out[39]:
In [40]:
# Mean of every numeric column per 'UNGPLocation' group, dropping groups with
# missing means, then the 'Friends' means in ascending order.
location_grouped = twitterData.groupby('UNGPLocation')
mean_location_grouped = location_grouped.mean().dropna()
mean_friends = mean_location_grouped.sort('Friends')['Friends']

friendsBar = vincent.Bar(mean_friends)
friendsBar.axis_titles(x='Country', y='Friends')

# Give both axes light-gray strokes, labels and 20pt titles.
# AxisProperties, PropertySet and ValueRef are imported in the followers-bar
# cell above.
for axis in friendsBar.axes:
    axis.properties = AxisProperties()
    for prop in ('ticks', 'axis', 'major_ticks', 'minor_ticks'):
        setattr(
            axis.properties,
            prop,
            PropertySet(stroke=ValueRef(value=gpLightGray)),
        )
    axis.properties.title = PropertySet(
        font_size=ValueRef(value=20),
        fill=ValueRef(value=gpLightGray),
    )
    axis.properties.labels = PropertySet(fill=ValueRef(value=gpLightGray))

# First axis only: horizontal, centred labels and a title pushed down by 20.
friendsBar.axes[0].properties.labels.angle = ValueRef(value=0)
friendsBar.axes[0].properties.labels.align = ValueRef(value='center')
friendsBar.axes[0].properties.title.dy = ValueRef(value=20)

# Single-colour bars.
friendsBar.scales[2].range = [gpDarkBlue]

# Save the chart spec, then display the chart as the cell output.
friendsBar.to_json('../charts/friendsBar.json')
friendsBar
Out[40]:
In [41]:
# Mean of every numeric column per 'UNGPLocation' group, dropping groups with
# missing means, then the 'UNGPGenderProb' means in ascending order.
location_grouped = twitterData.groupby('UNGPLocation')
mean_location_grouped = location_grouped.mean().dropna()
mean_genderProb = mean_location_grouped.sort('UNGPGenderProb')['UNGPGenderProb']

genderProb = vincent.Bar(mean_genderProb)
# Axis title spelling fixed (was 'Probablility'); it is shown on the chart.
genderProb.axis_titles(x='Country', y='Average Gender Probability')

# Give both axes light-gray strokes, labels and 20pt titles.
# AxisProperties, PropertySet and ValueRef are imported in the followers-bar
# cell earlier in the notebook.
for axis in genderProb.axes:
    axis.properties = AxisProperties()
    for prop in ['ticks', 'axis', 'major_ticks', 'minor_ticks']:
        setattr(axis.properties, prop,
                PropertySet(stroke=ValueRef(value=gpLightGray)))
    axis.properties.title = PropertySet(font_size=ValueRef(value=20),
                                        fill=ValueRef(value=gpLightGray))
    axis.properties.labels = PropertySet(fill=ValueRef(value=gpLightGray))

# First axis only: horizontal, centred labels and a title pushed down by 20.
genderProb.axes[0].properties.labels.angle = ValueRef(value=0)
genderProb.axes[0].properties.labels.align = ValueRef(value='center')
genderProb.axes[0].properties.title.dy = ValueRef(value=20)

# Single-colour bars.
genderProb.scales[2].range = [gpRed]

# Save the chart spec, then display the chart as the cell output.
genderProb.to_json('../charts/genderProbBar.json')
genderProb
Out[41]:
In [42]:
# `mpld3` is used here but was never imported anywhere in the notebook, so a
# fresh kernel raised NameError on the first line of this cell.
import mpld3

mpld3.enable_notebook()

# Tweet counts per 'UNGPLocation' value, drawn as a horizontal bar chart.
gatesCountry = twitterData.UNGPLocation.value_counts(normalize=False, sort=True, ascending=False, bins=None)
gatesCountryFig = gatesCountry.plot(kind='barh', color='#00aeef')
# Render the current matplotlib figure through mpld3.
mpld3.display()
In [70]:
import ggplot as gg

# Bar chart of the number of tweets per 'UNGPLocation' value.
# Every ggplot name is qualified with `gg.`: the original mixed `gg.`-prefixed
# calls with a bare `ggplot(...)` that only resolved because of the earlier
# `from ggplot import *` cell.
(gg.ggplot(gg.aes(x='UNGPLocation'), data=twitterData)
 + gg.geom_bar() + gg.ggtitle("Gates Tweets")
 + gg.labs("Country", "Number of tweets"))
In [77]:
# Bar chart of the number of tweets per 'DataSiftLanguage' value, built from
# the star-imported ggplot names and shown as the cell output.
languagePlot = (
    ggplot(aes(x='DataSiftLanguage'), data=twitterData)
    + geom_bar()
    + ggtitle("Language Distribution")
    + labs("Language", "Number of tweets")
)
languagePlot
In [1]:
from IPython.core.display import HTML
# Load the notebook's custom stylesheet and render it as the cell output.
# The context manager closes the file handle as soon as the contents are read,
# instead of leaving it open until garbage collection.
with open("../css/custom.css", "r") as css_file:
    styles = css_file.read()
HTML(styles)
Out[1]: